# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from urllib.parse import urljoin
from news.items import QQItem


class QQSpider(scrapy.Spider):
    """Spider for the news.qq.com front page, rendered through Splash.

    Yields one ``QQItem`` per headline in the main list, carrying title,
    link, preview images, author, publish time, and its 1-based position
    on the page (``sort``).
    """

    name = 'qq'
    allowed_domains = ['news.qq.com']
    start_urls = ['https://news.qq.com/']
    # Prefix joined onto image srcs. Protocol-relative srcs ("//img...")
    # become "http://img..."; already-absolute URLs pass through urljoin
    # unchanged. Plain relative paths would NOT resolve correctly here —
    # presumably the page only emits the first two forms (verify).
    image_scheme = 'http://'

    custom_settings = {
        'ITEM_PIPELINES': {
            'news.pipelines.QQCheckEmptyPipeline': 10,
            'scrapy.pipelines.images.ImagesPipeline': 100,
            'news.pipelines.QQSavePipeline': 300,
        }
    }

    def start_requests(self):
        """Request each start URL via Splash so JS-built markup is rendered."""
        # Loop-invariant render options, hoisted out of the loop.
        splash_args = {
            'wait': 0.5,                # settle time after page load
            'viewport': '1024x6480',    # tall viewport so the full list renders
            'timeout': 90,
            'images': 0,                # skip image downloads during rendering
            'resource_timeout': 10,
        }
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, args=splash_args)

    def parse(self, response):
        """Extract one QQItem per <li class="cf"> entry in the main list.

        ``sort`` records the 1-based position of the entry on the page.
        """
        rows = response.xpath(
            '//div[@id="List"]//ul[@class="list"]/li[contains(@class, "cf")]')
        # enumerate replaces the original manual sort counter.
        for sort, selector in enumerate(rows, start=1):
            item = QQItem()
            item["title"] = selector.xpath('.//h3/a/text()').get()
            item["link"] = selector.xpath('.//h3/a/@href').get()
            item["preview_images"] = selector.xpath('.//img/@src').getall()
            item["author"] = selector.xpath('.//a[@class="source"]/text()').get()
            item["author_url"] = selector.xpath('.//a[@class="source"]/@href').get()
            item["publish_time"] = selector.xpath('.//span[@class="time"]/text()').get()
            item["sort"] = sort

            # Absolute URLs for ImagesPipeline (see image_scheme note above).
            item["image_urls"] = [
                urljoin(self.image_scheme, src) for src in item["preview_images"]
            ]

            # NOTE(review): `now` is not a stock scrapy.crawler.Crawler
            # attribute — presumably attached by a project extension;
            # confirm it is set before this spider runs.
            item["update_time"] = self.crawler.now

            yield item
